"""
html utils with BeautifulSoup and so on
thanks for BeautifulSoup
"""

from BeautifulSoup import *
from htmlentities import *
from spider.algorithm import *
from spider.filesystem import *


def parse(html):
	"""Build a BeautifulSoup object from *html* and return it."""
	soup = BeautifulSoup(html)
	return soup


def extract_body_text(html, rate=1):
	"""Return the non-empty lines of the densest text region of *html*.

	The input is split on newlines and every line is stripped.  Each line
	is then scored: a non-blank line scores its stripped length, while a
	blank line scores a negative penalty.  The scores are handed to
	``max_sub_seq_sum`` and the lines inside the range it reports are
	returned with blank lines removed.

	html -- text to scan; it is split on "\\n".
	rate -- multiplier applied to every blank-line penalty (default 1).

	Returns a list of stripped, non-empty lines.
	"""
	lines = [line.strip() for line in html.split("\n")]
	scores = [len(line) for line in lines]

	blank_run = 0  # length of the run of blank lines seen so far
	for idx, line in enumerate(lines):
		if not line:
			# Ramp up: the k-th blank line of a run is penalised -k*rate.
			blank_run += 1
			scores[idx] = -blank_run * rate
		elif blank_run:
			# The run just ended.  Mirror the ramp onto its trailing half so
			# an enclosed run has a symmetric profile (-1, -2, ..., -2, -1)
			# times rate.  The leading half already holds these values.
			for depth in range(1, blank_run // 2 + 1):
				scores[idx - depth] = -depth * rate
			blank_run = 0
	# A run of blank lines that reaches the end of the input keeps its
	# plain ramp, because no non-blank line follows to close it.

	# NOTE(review): max_sub_seq_sum comes from spider.algorithm; presumably it
	# returns (best sum, begin index, end index) of the maximum-sum contiguous
	# run -- confirm there.  `end` is treated as inclusive by the slice below.
	max_sum, begin, end = max_sub_seq_sum(scores)

	# Build a real list so callers get the same type on Python 2 and 3
	# (filter() is lazy on Python 3).
	return [line for line in lines[begin:end + 1] if line]